##############
#LIBRARIES####
##############
library(readr)
library(skimr)# Beautiful Summarize
library(cleandata)# ordinal encoding
library(onehot)# nominal encoding
library(ggplot2)
library(ggpubr)
library(easyGgplot2)
library(forcats)
library(PerformanceAnalytics) # Correlations
library(corrplot)
library(dplyr) # select
library(factoextra)
library("NbClust")
library(cluster)
library(zoo)
library(ggfortify)
require(forecast)
# Load the raw hotel-booking data set; readr infers the column types
# (the parse spec echoed below comes from the captured console session).
raw_data<-read_csv("H1.csv")
Parsed with column specification:
cols(
.default = col_double(),
ArrivalDateMonth = [31mcol_character()[39m,
Meal = [31mcol_character()[39m,
Country = [31mcol_character()[39m,
MarketSegment = [31mcol_character()[39m,
DistributionChannel = [31mcol_character()[39m,
ReservedRoomType = [31mcol_character()[39m,
AssignedRoomType = [31mcol_character()[39m,
DepositType = [31mcol_character()[39m,
Agent = [31mcol_character()[39m,
Company = [31mcol_character()[39m,
CustomerType = [31mcol_character()[39m,
ReservationStatus = [31mcol_character()[39m,
ReservationStatusDate = [34mcol_date(format = "")[39m
)
See spec(...) for full column specifications.
# Sanity check: ReservationStatus is consistent with the IsCanceled target.
# 'Check-Out' bookings were never cancelled (0); 'No-Show' and 'Canceled'
# always correspond to IsCanceled == 1.
unique(raw_data$IsCanceled[raw_data$ReservationStatus == 'Check-Out'])
[1] 0
unique(raw_data$IsCanceled[raw_data$ReservationStatus == 'No-Show'])
[1] 1
unique(raw_data$IsCanceled[raw_data$ReservationStatus == 'Canceled'])
[1] 1
# Work on a copy so the raw data stays untouched.
raw_data_encoded = raw_data
# ORDINAL ENCODING
#ArrivalDateMonth
# Month names in calendar order, so the ordinal codes run 1..12.
levels <- c('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December')
raw_data_encoded$ArrivalDateMonth = factor(raw_data_encoded$ArrivalDateMonth, order = TRUE , levels)
x <- as.data.frame(raw_data_encoded$ArrivalDateMonth)
# cleandata::encode_ordinal maps each month to its ordinal code.
raw_data_encoded$ArrivalDateMonth <- encode_ordinal( x, levels, none='', out.int=FALSE,
full_print=TRUE)
raw_data_encoded$ArrivalDateMonth
August : 4894
July : 4573
April : 3609
May : 3559
October: 3555
March : 3336
(Other):16534
coded 1 cols 12 levels
raw_data_encoded$ArrivalDateMonth
8 : 4894
7 : 4573
4 : 3609
5 : 3559
10 : 3555
3 : 3336
(Other):16534
# Flatten the encoded column (a one-column data frame) into a numeric vector.
raw_data_encoded$ArrivalDateMonth <- as.numeric(unlist(raw_data_encoded$ArrivalDateMonth))
print(dim(raw_data_encoded))
[1] 40060 31
# NOMINAL ENCODING - ONEHOT ENCODING
# https://github.com/Zelazny7/onehot
# Each block below builds a one-column data frame, fits a onehot encoder
# and appends the resulting indicator columns to raw_data_encoded.
# ReservedRoomType
ReservedRoomType <- as.data.frame(raw_data_encoded$ReservedRoomType)
encoder <- onehot(ReservedRoomType, max_levels = 15, add_NA_factors = FALSE)
ReservedRoomType_onehot<- predict(encoder, ReservedRoomType, stringsAsFactors=TRUE)
raw_data_encoded <- cbind(raw_data_encoded, ReservedRoomType_onehot)
print(dim(raw_data_encoded))
[1] 40060 41
# AssignedRoomType
AssignedRoomType <- as.data.frame(raw_data_encoded$AssignedRoomType)
encoder <- onehot(AssignedRoomType, max_levels = 15, add_NA_factors = FALSE)
AssignedRoomType_onehot<- predict(encoder, AssignedRoomType, stringsAsFactors=TRUE)
raw_data_encoded <- cbind(raw_data_encoded, AssignedRoomType_onehot)
print(dim(raw_data_encoded))
[1] 40060 52
# Meal
Meal <- as.data.frame(raw_data_encoded$Meal)
encoder <- onehot(Meal, max_levels = 15, add_NA_factors = FALSE)
Meal_onehot<- predict(encoder, Meal, stringsAsFactors=TRUE)
raw_data_encoded <- cbind(raw_data_encoded, Meal_onehot)
print(dim(raw_data_encoded))
[1] 40060 57
# Country
# Country has far more levels, hence the higher max_levels cap (130).
Country <- as.data.frame(raw_data_encoded$Country)
encoder <- onehot(Country, max_levels = 130, add_NA_factors = FALSE)
Country_onehot<- predict(encoder, Country, stringsAsFactors=TRUE)
raw_data_encoded <- cbind(raw_data_encoded, Country_onehot)
print(dim(raw_data_encoded))
[1] 40060 183
# NOMINAL ENCODING (continued): one-hot encode the remaining categorical
# columns. The four sections here used to be copy-pasted boilerplate; they
# are folded into a single documented helper.
#
# one_hot_append: one-hot encode column `col_name` of `df` (via the onehot
# package) and return `df` with the indicator columns appended. The original
# column is kept, matching the behaviour of the code this replaces.
# Indicator columns are named "<col_name>=<level>".
#   df         - data frame to extend
#   col_name   - name of the categorical column to encode
#   max_levels - cap on the number of levels passed to onehot()
one_hot_append <- function(df, col_name, max_levels = 15) {
  col_df <- setNames(as.data.frame(df[[col_name]]), col_name)
  enc <- onehot(col_df, max_levels = max_levels, add_NA_factors = FALSE)
  cbind(df, predict(enc, col_df, stringsAsFactors = TRUE))
}
# MarketSegment
raw_data_encoded <- one_hot_append(raw_data_encoded, "MarketSegment")
# DistributionChannel
raw_data_encoded <- one_hot_append(raw_data_encoded, "DistributionChannel")
# DepositType
raw_data_encoded <- one_hot_append(raw_data_encoded, "DepositType")
# CustomerType
raw_data_encoded <- one_hot_append(raw_data_encoded, "CustomerType")
# Agent
# The string "NULL" means "no agent"; recode it to 0 so the column can be
# converted to numeric IDs.
raw_data_encoded$Agent <- ifelse(raw_data_encoded$Agent == "NULL", 0,raw_data_encoded$Agent)
raw_data_encoded$Agent <- as.numeric(raw_data_encoded$Agent)
# Company
# Same "NULL" -> 0 recoding for the company IDs.
raw_data_encoded$Company <- ifelse(raw_data_encoded$Company == "NULL", 0,raw_data_encoded$Company)
raw_data_encoded$Company <- as.numeric(raw_data_encoded$Company)
# Binary indicators: was any agent / company involved in the booking?
raw_data_encoded$AgentYes <- ifelse(raw_data_encoded$Agent == 0, 0, 1)
raw_data_encoded$CompanyYes <- ifelse(raw_data_encoded$Company == 0, 0, 1)
# Keep a copy for the time-series part of the analysis.
timeSeriesDataSet = raw_data_encoded
Error: object 'raw_data_encoded' not found
set.seed(123)
#Encoded data sample
# 1000-row random sample used for the expensive pairwise/cluster analyses.
raw_data_encoded_sample = sample_n(raw_data_encoded, size = 1000)
Error in sample_n(raw_data_encoded, size = 1000) :
object 'raw_data_encoded' not found
# NOTE(review): the two errors above are residue from the captured console
# session (the object was missing in that session); the code is consistent.
#isCanceled
# The first column of the encoded data is the target variable.
isCanceled_sample = raw_data_encoded_sample[,1]
# Class balance of the sample, as proportions.
target <- as.data.frame(raw_data_encoded_sample %>%group_by(IsCanceled)%>%summarise(counts = n()))%>%mutate(perc = counts/nrow(raw_data_encoded_sample))
ggplot(target, aes(x = IsCanceled, y = perc)) + geom_bar(stat = "identity")+
geom_text(aes(label = round(perc,2)))
#Predictive Variables: all of them except:
# - IsCanceled (Target Variable) and ArrivalDateYear.
# We drop ArrivalDateYear because we do not want our models to depend on the
# specific year (time replicability).
predictiveVariables_sample = raw_data_encoded_sample[,c(-1,-3)]
raw_data_prediction <- raw_data_encoded[-3]
Error: object 'raw_data_encoded' not found
# Class balance of the FULL data set, for comparison with the sample.
target <- as.data.frame(raw_data %>%group_by(IsCanceled)%>%summarise(counts = n()))%>%mutate(perc = counts/nrow(raw_data))
ggplot(target, aes(x = IsCanceled, y = perc)) + geom_bar(stat = "identity")+
geom_text(aes(label = round(perc,2)))
Plots produced below: a general histogram, a histogram split by cancellation status (0/1), and a boxplot.
# eda_plot: exploratory panel for one numeric variable against IsCanceled.
#   df_function - data frame whose FIRST column is assumed to be IsCanceled
#   character   - axis label for the variable (a string)
#   column      - index of the variable to plot
# Returns a 2x2 ggarrange panel: (A) density by cancellation status,
# (B) histogram with mean line and density curve, (C) boxplot per class.
eda_plot <- function(df_function, character, column){
  # Keep only target + variable, with fixed names for plotting.
  df_function <- df_function[,c(1,column)]
  colnames(df_function) <- c("IsCanceled", "Variable" )
  #############
  # HISTOGRAMS#
  #############
  # Density of the variable, coloured by cancellation status.
  bxp <- ggplot(df_function) +
    geom_density(aes(x = Variable, fill = IsCanceled), alpha = 0.2)+
    xlab(character)
  # Histogram with mean line and density curve (easyGgplot2 helper).
  dp <- ggplot2.histogram(data=df_function, xName= 'Variable', addMeanLine=TRUE, meanLineColor="green",
                          meanLineType="dashed", meanLineSize=1,
                          addDensityCurve=TRUE, densityFill='blue', fill = 'blue')+ xlab(character)
  ##################################
  # RELATIONSHIPS BETWEEN VARIABLES#
  ##################################
  # Boxplot of the variable per cancellation class. The original chained
  # three xlab() calls; in ggplot2 only the last one takes effect, so a
  # single xlab("") is equivalent (the x-axis label is intentionally blank).
  bp <- df_function %>%
    mutate(class = fct_reorder(IsCanceled, Variable, .fun = 'length')) %>%
    ggplot(aes(x = IsCanceled, y = Variable, fill = class)) +
    geom_boxplot() +
    ylab(character) +
    theme(legend.position = "none") +
    xlab("")
  ggarrange(bxp, dp, bp,
            labels = c("A", "B", "C"),
            ncol = 2, nrow = 2)
}
# Treat the target as categorical for the plots below.
raw_data$IsCanceled <- factor(raw_data$IsCanceled)
#Plot function
# NOTE(review): this first definition of eda_plot_categorical is dead code;
# it is immediately overwritten by the simpler definition that follows.
# Kept verbatim to preserve the script's history.
eda_plot_categorical <- function(df_function, character, column){
df_function <- df_function[,c(1,column)]
colnames(df_function) <- c("IsCanceled", "Variable" )
#############
# HISTOGRAMS#
#############
bxp <- ggplot(df_function) +
geom_density(aes(x = Variable, fill = IsCanceled), alpha = 0.2)+
xlab(character)
dp <- ggplot2.histogram(data=df_function, xName= 'Variable', addMeanLine=TRUE, meanLineColor="green",
meanLineType="dashed", meanLineSize=1,
addDensityCurve=TRUE, densityFill='blue', fill = 'blue')+ xlab(character)
ggarrange(bxp, dp,
labels = c("A", "B"),
ncol = 2)
}
# Plot function for categorical variables: density of the selected column,
# coloured by cancellation status.
#   df_function - data frame whose first column is IsCanceled
#   character   - x-axis label for the variable
#   column      - index of the variable to plot
# Returns the ggplot object.
eda_plot_categorical <- function(df_function, character, column){
  plot_df <- df_function[, c(1, column)]
  names(plot_df) <- c("IsCanceled", "Variable")
  ggplot(plot_df) +
    geom_density(aes(x = Variable, fill = IsCanceled), alpha = 0.2) +
    xlab(character)
}
# Exploratory plots for every raw variable against the target. Numeric
# variables use eda_plot (density + histogram + boxplot); categorical or
# high-cardinality ones use eda_plot_categorical (density only). The column
# index must match the variable's position in raw_data.
eda_plot(raw_data, "LeadTime", column = 2)
eda_plot(raw_data, "ArrivalDateYear", column = 3)
eda_plot_categorical(raw_data, "ArrivalDateMonth", column = 4)
eda_plot(raw_data, "ArrivalDateWeekNumber", column = 5)
eda_plot(raw_data, "ArrivalDateDayOfMonth", column = 6)
eda_plot(raw_data, "StaysInWeekendNights", column = 7)
eda_plot(raw_data, "StaysInWeekNights", column = 8)
eda_plot(raw_data, "Adults", column = 9)
eda_plot(raw_data, "Children", column = 10)
eda_plot(raw_data, "Babies", column = 11)
eda_plot_categorical(raw_data, "Meal", column = 12)
eda_plot_categorical(raw_data, "Country", column = 13)
eda_plot_categorical(raw_data, "MarketSegment", column = 14)
eda_plot_categorical(raw_data, "DistributionChannel", column = 15)
eda_plot(raw_data, "IsRepeatedGuest", column = 16)
eda_plot(raw_data, "PreviousCancellations", column = 17)
eda_plot(raw_data, "PreviousBookingsNotCanceled", column = 18)
eda_plot_categorical(raw_data, "ReservedRoomType", column = 19)
eda_plot_categorical(raw_data, "AssignedRoomType", column = 20)
eda_plot(raw_data, "BookingChanges", column = 21)
eda_plot_categorical(raw_data, "DepositType", column = 22)
eda_plot_categorical(raw_data, "Agent", column = 23)
eda_plot_categorical(raw_data, "Company", column = 24)
eda_plot_categorical(raw_data, "DaysInWaitingList", column = 25)
eda_plot_categorical(raw_data, "CustomerType", column = 26)
eda_plot(raw_data, "ADR", column = 27)
eda_plot(raw_data, "RequiredCarParkingSpaces", column = 28)
eda_plot(raw_data, "TotalOfSpecialRequests", column = 29)
eda_plot_categorical(raw_data, "ReservationStatus", column = 30)
# Density plots for the engineered agent/company indicators; the column
# indices refer to positions in the fully encoded data frame.
raw_data_encoded$IsCanceled <- factor(raw_data_encoded$IsCanceled)
eda_plot_categorical(raw_data_encoded, "AgentYes", column = 191)
eda_plot_categorical(raw_data_encoded, "CompanyYes", column = 192)
# Correlation structure of the first 20 encoded predictors (target excluded).
corrplot(cor(raw_data_encoded[,2:21],
use = "complete.obs"),
method = "circle",
type = 'upper',
order = "hclust",
addrect = 2,
tl.cex = 0.4)
# Pairwise scatter/correlation panel on the sampled predictors.
chart.Correlation(predictiveVariables_sample[,1:19],
histogram = TRUE, pch = 19)
# Correlations including the target itself.
# NOTE: as.numeric() on the IsCanceled factor yields codes 1/2 rather than
# 0/1; correlation is invariant under that affine shift, so results match.
highcorrelation <- raw_data_encoded
highcorrelation$IsCanceled <- as.numeric(highcorrelation$IsCanceled)
corrplot(cor(highcorrelation[, 1:21],
             use = "complete.obs"),
         method = "circle",
         type = 'upper',
         order = "hclust",
         addrect = 2,
         tl.cex = 0.4)
#Check initial correlation between variables and IsCanceled based on a coefficient.
newData.cor = cor(highcorrelation[, 1:21],
                  use = "complete.obs")
corrplot(newData.cor)
corrplot(newData.cor, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)
newData.cor <- data.frame(newData.cor)
# Keep every variable whose absolute correlation with the target (row 1 of
# the correlation matrix, i.e. IsCanceled) exceeds the threshold. This
# vectorised selection replaces the original append()-in-a-loop, which grew
# three vectors one element at a time and iterated with 1:dim(...)[1].
coeff <- 0.1
target_cor <- as.numeric(newData.cor[1, ])
keep <- abs(target_cor) > coeff
correlated_variables <- data.frame(
  variable = names(newData.cor)[keep],
  correlated = target_cor[keep],
  coefficient_limit = coeff
)
print(correlated_variables)
#############################################
# CONTINGENCY TABLE OF CATEGORICAL VARIABLES#
#############################################
# The ten near-identical sections below shared the same loop + barplot
# boilerplate; it is factored into one documented helper. The original
# loops also clobbered unrelated globals (e.g. `Meal`, `ReservedRoomType`,
# which earlier held the one-hot input data frames) by reusing them as loop
# counters; the helper avoids that side effect.
#
# plot_cancel_ratio: for a 2-row contingency table (row 1 = not cancelled,
# row 2 = cancelled), plot 100 * cancelled / not-cancelled per category and
# invisibly return the total number of cancellations.
# NOTE(review): this quantity is cancellations per 100 NON-cancellations
# (odds x 100), not a true percentage of all bookings -- kept as in the
# original so the figures stay comparable.
#   tab       - xtabs table, IsCanceled x variable (xtabs orders the
#               variable's categories alphabetically)
#   var_name  - name used in the plot title and x-axis label
#   bar_names - bar labels, in the same order as the (kept) categories
#   drop      - optional indices of categories to omit from the plot
plot_cancel_ratio <- function(tab, var_name, bar_names, drop = NULL) {
  ratio <- as.numeric(tab[2, ] / tab[1, ] * 100)
  if (!is.null(drop)) {
    ratio <- ratio[-drop]
  }
  barplot(ratio,
          main = paste("IsCanceled % vs", var_name),
          xlab = var_name,
          ylab = "IsCanceled %",
          names.arg = bar_names,
          col = "darkred",
          horiz = FALSE)
  invisible(sum(tab[2, ]))
}
# IsCanceled VS ArrivalDateMonth (labels follow xtabs' alphabetical order)
contingency_table <- xtabs(~IsCanceled+ArrivalDateMonth, data = raw_data)
default_total <- plot_cancel_ratio(contingency_table, "ArrivalDateMonth",
                                   c("April", "August", "December", "February", "January", "July",
                                     "June", "March", "May", "November", "October", "September"))
# IsCanceled VS Meal
contingency_table <- xtabs(~IsCanceled+Meal, data = raw_data)
default_total <- plot_cancel_ratio(contingency_table, "Meal",
                                   c("BB", "FB", "HB", "SC", "Undefined"))
# IsCanceled VS MarketSegment
contingency_table <- xtabs(~IsCanceled+MarketSegment, data = raw_data)
default_total <- plot_cancel_ratio(contingency_table, "MarketSegment",
                                   c("Complementary", "Corporate", "Direct", "Groups",
                                     "Offline TA/TO", "Online TA"))
# IsCanceled VS DistributionChannel
contingency_table <- xtabs(~IsCanceled+DistributionChannel, data = raw_data)
default_total <- plot_cancel_ratio(contingency_table, "DistributionChannel",
                                   c("Corporate", "Direct", "TA/TO", "Undefined"))
# IsCanceled VS ReservedRoomType
contingency_table <- xtabs(~IsCanceled+ReservedRoomType, data = raw_data)
contingency_table
# The 10th room-type category is dropped, as in the original ([-10]);
# presumably an unused/irrelevant level -- TODO confirm against the data.
default_total <- plot_cancel_ratio(contingency_table, "ReservedRoomType",
                                   c("A", "B", "C", "D", "E", "F", "G", "H", "L"),
                                   drop = 10)
# IsCanceled VS AssignedRoomType
contingency_table <- xtabs(~IsCanceled+AssignedRoomType, data = raw_data)
# Categories 10 and 11 dropped, as in the original ([-c(10, 11)]).
default_total <- plot_cancel_ratio(contingency_table, "AssignedRoomType",
                                   c("A", "B", "C", "D", "E", "F", "G", "H", "I"),
                                   drop = c(10, 11))
# IsCanceled VS DepositType
contingency_table <- xtabs(~IsCanceled+DepositType, data = raw_data)
default_total <- plot_cancel_ratio(contingency_table, "DepositType",
                                   c("No Deposit", "Non Refund", "Refundable"))
# IsCanceled VS CustomerType
contingency_table <- xtabs(~IsCanceled+CustomerType, data = raw_data)
default_total <- plot_cancel_ratio(contingency_table, "CustomerType",
                                   c("Contract", "Group", "Transient", "Transient-Party"))
# IsCanceled VS CompanyYes (engineered indicator, from the encoded data)
contingency_table <- xtabs(~IsCanceled+CompanyYes, data = raw_data_encoded)
default_total <- plot_cancel_ratio(contingency_table, "CompanyYes", c(0, 1))
# IsCanceled VS AgentYes (engineered indicator, from the encoded data)
contingency_table <- xtabs(~IsCanceled+AgentYes, data = raw_data_encoded)
default_total <- plot_cancel_ratio(contingency_table, "AgentYes", c(0, 1))
Clustering: (1) goodness / tendency of the cluster analysis, (2) optimal number of clusters, (3) the clustering itself.
#Scaled sample
predictiveVariables_sampleScaled = scale(predictiveVariables_sample)
predictiveVariables_sampleScaled = as.data.frame(predictiveVariables_sampleScaled)
#Columns with all zeros from the sample are removed (constant columns
# break scaling and distance computations).
predictiveVariablesNonZero = predictiveVariables_sample[!sapply(predictiveVariables_sample, function(x) sum(x)== 0)]
predictiveVariablesNonZeroScaled = scale(predictiveVariablesNonZero)
library(cluster)
# NOTE(review): despite its name, gower_dist is computed with the EUCLIDEAN
# metric, not Gower's -- the name is misleading but kept for compatibility.
gower_dist = daisy(as.matrix(predictiveVariables_sample) , metric = "euclidean", stand = FALSE)
# Hopkins statistic: clustering tendency of the sample (500 points).
bondad_ac = get_clust_tendency(predictiveVariables_sample, 500)
bondad_ac$hopkins_stat
# Gap statistic via NbClust to suggest the number of clusters (2..10).
clus.nb = NbClust(predictiveVariablesNonZeroScaled, distance = "euclidean",min.nc = 2, max.nc = 10,method = "complete", index ="gap")
clus.nb$Best.nc
# Calculate silhouette width for many k using PAM
sil_width <- c(NA)
for(i in 2:10){
pam_fit <- pam(gower_dist,
diss = TRUE,
k = i)
sil_width[i] <- pam_fit$silinfo$avg.width
}
# Plot silhouette width (higher is better)
plot(1:10, sil_width,
xlab = "Optimal k clusters",
ylab = "Average Width Profile")
lines(1:10, sil_width)
# Only numeric Variables for clustering (selected by column position).
numericVariables <- predictiveVariablesNonZero[c(17,7,2,3,9,13,8,16,1,12,11,18,5,6,19)]
fviz_nbclust(numericVariables, cluster::clara, method = "silhouette") + ggtitle("Optimal number of clusters - CLARA") + labs(x = "Optimal k clusters", y = "Average Width Profile")
# 2 groups
The optimal number of clusters will be given by the value of k that maximizes the average profile. In this case: 2
# Calculate silhouette width for many k using FANNY (fuzzy clustering)
sil_width <- c(NA)
for(i in 2:10){
fanny_fit <- fanny(gower_dist,
diss = TRUE,
k = i)
sil_width[i] <- fanny_fit$silinfo$avg.width
}
# Plot silhouette width (higher is better)
plot(1:10, sil_width,
xlab = "Number of clusters",
ylab = "Silhouette Width")
lines(1:10, sil_width)
# Agglomerative hierarchical clustering on the same distance matrix.
aggl.clust.c <- hclust(gower_dist, method = "complete")
library(fpc)
# cstats.table: summarise fpc::cluster.stats for cuts of a hierarchical
# tree at k = 2..k clusters.
#   dist - distance object used to build the tree
#   tree - hclust result
#   k    - maximum number of clusters to evaluate
# Returns a data frame: one column per k (labelled "Test 1", "Test 2", ...),
# rows = the selected quality statistics followed by per-cluster sizes.
cstats.table <- function(dist, tree, k) {
# Statistics extracted from cluster.stats() for each cut.
clust.assess <- c("cluster.number","n","within.cluster.ss","average.within","average.between","wb.ratio","dunn2","avg.silwidth")
clust.size <- c("cluster.size")
stats.names <- c()
row.clust <- c()
output.stats <- matrix(ncol = k, nrow = length(clust.assess))
cluster.sizes <- matrix(ncol = k, nrow = k)
# Row labels for the per-cluster size block.
for (i in c(1:k)) {
row.clust[i] <- paste("Cluster-", i, " size")
}
# One column per cut (k = 2..k). NOTE(review): cluster.stats() is invoked
# once per statistic and once per cluster size below, which is redundant
# but preserved as-is (intricate indexing; behaviour unverified).
for (i in c(2:k)) {
stats.names[i] <- paste("Test", i - 1)
for (j in seq_along(clust.assess)) {
output.stats[j, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j]
}
for (d in 1:k) {
cluster.sizes[d, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d]
# NOTE(review): the next line assigns dim() to a temporary scalar copy and
# has no lasting effect -- looks like leftover scaffolding; kept verbatim.
dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1)
cluster.sizes[d, i]
}
}
output.stats.df <- data.frame(output.stats)
cluster.sizes <- data.frame(cluster.sizes)
# Cuts with fewer than k clusters leave NA sizes; show them as 0.
cluster.sizes[is.na(cluster.sizes)] <- 0
rows.all <- c(clust.assess, row.clust)
# rownames(output.stats.df) <- clust.assess
# Drop the first (empty) column: the loop starts at k = 2.
output <- rbind(output.stats.df, cluster.sizes)[ ,-1]
colnames(output) <- stats.names[2:k]
rownames(output) <- rows.all
is.num <- sapply(output, is.numeric)
output[is.num] <- lapply(output[is.num], round, 2)
output
}
# Average silhouette width per number of clusters for the hierarchical cut.
ggplot(data = data.frame(t(cstats.table(gower_dist, aggl.clust.c, 15))),
aes(x = cluster.number, y = avg.silwidth)) +
geom_point() +
geom_line() +
ggtitle("Optimal number of clusters - HCUT") +
labs(x = "Optimal k clusters", y = "Average Width Profile") +
theme(plot.title = element_text(hjust = 0.5))
# Dendrogram with the k = 2 cut highlighted.
plot(aggl.clust.c, main = "HCUT ")
rect.hclust(aggl.clust.c, k = 2, border = 2:3)
set.seed(123)
grp <- cutree(aggl.clust.c, k = 2)
fviz_cluster(list(data = predictiveVariables_sample, cluster = grp), stand = FALSE, geom = "point", pointsize = 1, title = "Cluster Plot")
# Cross-tabulate cluster membership vs the target, in percent of the sample.
clust.num <- cutree(aggl.clust.c, k = 2)
tabl = prop.table(table(clust.num,isCanceled_sample))*100
tabl
# NOTE: tabl[1] is linear indexing, i.e. tabl[1, 1] (the not-cancelled cell).
tabl[1,2]/(tabl[1]+ tabl[1,2])*100 # % of Cancelations of Group 1
tabl[2,2]/(tabl[2]+tabl[2,2])*100 # % of Cancelations of Group 2
grp <- cutree(aggl.clust.c, k = 2)
# aggl.clust.c <- hclust(gower_dist, method = "complete")
sil_cl <- silhouette(grp,gower_dist, title=title(main = 'Good'))
# rownames(sil_cl) <- rownames(grp)
plot(sil_cl)
# PAM with k = 4, visualised on a 2-D t-SNE embedding of the distances.
library(Rtsne)
pam_fit <- pam(gower_dist, diss = TRUE, k = 4)
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering))
ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = cluster))
# Cancellation mix per PAM cluster.
pam_fit$clustering<- as.factor(pam_fit$clustering)
tabl = prop.table(table(pam_fit$clustering,isCanceled_sample))*100
tabl
tabl[1,2]/(tabl[1]+ tabl[1,2])*100 # % of Cancelations of Group 1
tabl[2,2]/(tabl[2]+tabl[2,2])*100 # % of Cancelations of Group 2
tabl[3,2]/(tabl[3]+tabl[3,2])*100 # % of Cancelations of Group 3
tabl[4,2]/(tabl[4]+tabl[4,2])*100 # % of Cancelations of Group 4
# tabl[5,2]/(tabl[5]+tabl[5,2])*100 # % of Cancelations of Group 5
# tabl[6,2]/(tabl[6]+tabl[6,2])*100 # % of Cancelations of Group 6
# tabl[7,2]/(tabl[7]+tabl[7,2])*100 # % of Cancelations of Group 7
# tabl[8,2]/(tabl[8]+tabl[8,2])*100 # % of Cancelations of Group 8
# tabl[9,2]/(tabl[9]+tabl[9,2])*100 # % of Cancelations of Group 9
# tabl[10,2]/(tabl[10]+tabl[10,2])*100 # % of Cancelations of Group 10
fviz_silhouette(pam_fit)
# CLARA with k = 2 on the numeric variables only.
library(cluster)
claraC2 = clara(numericVariables, 2,
samples = 1000, correct.d = FALSE)
fviz_cluster(claraC2, geom = "point", pointsize = 1, ellipse.type = "norm")
claraC2$clustering <- as.factor(claraC2$clustering)
tabl = prop.table(table(claraC2$clustering,isCanceled_sample))*100
tabl
tabl[1,2]/(tabl[1] + tabl[1,2])*100 # % of Cancelations of Group 1
tabl[2,2]/(tabl[2] + tabl[2,2])*100 # % of Cancelations of Group 2
fviz_silhouette(claraC2)
# Fuzzy clustering (FANNY) with k = 2, on the same t-SNE embedding.
library(Rtsne)
fanny_fit <- fanny(gower_dist, diss = TRUE, k = 2)
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(fanny_fit$clustering))
ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = cluster))
# Normalised segmentation (Dunn) coefficient:
fanny_fit$coeff
fanny_fit$clustering <- as.factor(fanny_fit$clustering)
tabl = prop.table(table(fanny_fit$clustering,isCanceled_sample))*100
tabl
tabl[1,2]/(tabl[1]+ tabl[1,2])*100 # % of Cancelations of Group 1
tabl[2,2]/(tabl[2]+tabl[2,2])*100 # % of Cancelations of Group 2
fviz_silhouette(fanny_fit)
# Contingency table DistributionChannel x ReservedRoomType for
# correspondence analysis.
roomChannel = as.data.frame.matrix(table(raw_data$DistributionChannel, raw_data$ReservedRoomType))
roomChannel
as.table(as.matrix(roomChannel))
roomChannelScaled = scale(roomChannel)
roomChannelScaled
# Chi-squared test of independence between channel and room type.
test.ind = chisq.test(roomChannel)
test.ind
# The result of the test confirms the possibility of rejecting the hypotheses of independence between categories of rows and columns, or, what is the same, we can affirm that there is a relationship between them. The rest of the tests presented below allow us to help visualize and interpret the test result.
roomChannelt = as.table(as.matrix(roomChannel))
roomChannelt
library("FactoMineR")
# tab1 <- as.data.frame.matrix(table(as.factor(df$X1),as.factor(df$X2)))
# res.ca <- CA(tab1, graph = FALSE)
library(FactoMineR)
# Correspondence analysis of the channel x room-type table.
roomChannel.ca=CA(roomChannel, graph = FALSE)
summary(roomChannel.ca)
fviz_ca_biplot(roomChannel.ca, map ="rowprincipal", arrow = c(TRUE, TRUE))
fviz_ca_biplot(roomChannel.ca, map ="colgreen",
arrow = c(TRUE, FALSE))+
ggtitle("Contribution of Distribution Channels to the dimensions")
fviz_ca_biplot(roomChannel.ca, map ="rowgreen",
arrow = c(FALSE, TRUE))+
ggtitle("Contribution of Room Types to the dimensions")
# Same analysis for Country x ReservedRoomType.
countryRoom = as.data.frame.matrix(table(raw_data$Country, raw_data$ReservedRoomType))
countryRoom
test.ind = chisq.test(countryRoom)
test.ind
# The result of the test confirms the possibility of rejecting the hypotheses of independence between categories of rows and columns, or, what is the same, we can affirm that there is a relationship between them. The rest of the tests presented below allow us to help visualize and interpret the test result.
library(FactoMineR)
countryRoom.ca=CA(countryRoom, graph = FALSE)
summary(countryRoom.ca)
fviz_ca_biplot(countryRoom.ca, map ="rowprincipal", arrow = c(TRUE, TRUE))
fviz_ca_biplot(countryRoom.ca, map ="colgreen",
arrow = c(TRUE, FALSE))+
ggtitle("Contribution of Country to the dimensions")
fviz_ca_biplot(countryRoom.ca, map ="rowgreen",
arrow = c(FALSE, TRUE))+
ggtitle("Contribution of Room Type to the dimensions")
# Only numeric Variables for clustering (same positional selection as used
# in the clustering section above).
numericVariables <- predictiveVariablesNonZero[c(17,7,2,3,9,13,8,16,1,12,11,18,5,6,19)]
# normalizeData: centre and scale a numeric vector to zero mean and unit
# standard deviation (z-score).
#   x     - numeric vector
#   na.rm - drop NA values when computing mean/sd. Default FALSE keeps the
#           original behaviour (any NA makes the whole result NA).
# Returns a numeric vector the same length as x. If sd(x) is 0 the result
# is NaN/Inf, exactly as in the original implementation.
normalizeData <- function(x, na.rm = FALSE){
  m <- mean(x, na.rm = na.rm)
  s <- sd(x, na.rm = na.rm)
  (x - m) / s
}
# Column-wise z-score standardisation of the numeric clustering variables.
# NOTE(review): apply() returns a matrix here, not a data frame.
numericVariables <- apply(numericVariables, 2, normalizeData)
In order to apply these unsupervised learning techniques, i.e. PCA, our data set must have some properties:
The determinant of our correlation matrix is 0.04720488. As we have a low determinant near to 0, it makes sense to apply PCA to reduce dimensions, because it also tells us that there is a high correlation between the variables studied.
For reference, Kaiser put the following values on the results: 0.00 to 0.49 unacceptable. 0.50 to 0.59 miserable. 0.60 to 0.69 mediocre. 0.70 to 0.79 middling. 0.80 to 0.89 meritorious. 0.90 to 1.00 marvelous.
So in our case, our variables are meritorious to apply factor analysis as our Overall MSA = 0.8.
In our case the p-value is equal to 0, so the factor analysis will be useful with our data as we are accepting the alternative hypothesis that our variables are correlated enough and diverging from the identity matrix.
After doing all these tests we can apply a data reduction technique such as the Principal Component Analysis with confidence.
A dimensionality reduction technique such as a PCA makes sense to see what variables can be explained in the same dimension.
As we can see in Individuals factor map, the directions of Dim1 and Dim2 are clearly orthogonal and explain 10.4% of the variance:
6% of the variance can be explained in Dim1.
4.4% of the variance can be explained in Dim2.
If the first two factors together explain most of the variability in the original 9 variables, then those factors are clearly a good, simpler substitute for all 9 variables. We can drop the rest without losing much of the original variability.
But if it takes 7 factors to explain most of the variance in those 9 variables, we might as well just use the original 9.
In this case we probably need more than 2 dimensions to explain most of the variance as we are only explaining 53% of the variance.
Here we start seeing that with more dimensions we are able to explain a higher percentage of our dataset but we still don’t see a clear difference between what the different dimensions explain.
All variables contribute to Dim1 or Dim2 above the threshold. If the contribution of the variables were uniform, the expected value would be 1/length(variables)
So this is the expected average contribution that we can use as a threshold.
Using that as a threshold, here we can see the highest contributions to Dim1:
Using that as a threshold, here we can see the highest contributions to Dim2. (NOTE: the variable list "length, lbh, wskull, ltibio" belongs to a different data set and looks like a copy-paste leftover — verify against the hotel-booking variables.)
The variance percentage and cumulative variance percentage changes less once we get to the fourth or fifth dimension.
The second document, with a maximum length of 6 pages, will solve a prediction problem whose target variable is whether the booking is cancelled or not, using the regression techniques studied in the course.
#PREDICTION
Logistic regression is a method for fitting a regression curve, y = f(x), when y is a categorical variable. The typical use of this model is predicting y given a set of predictors x. The predictors can be continuous, categorical or a mix of both. - PREDICTIVE MODEL: Multicolineality is not relevant as we are not searching for an explanatory model. ### TRAIN AND TEST
#TRAINING AND TESTING DATA
set.seed(131822)
n <- nrow(raw_data_prediction)
# Standardise the predictors; the (scaled) target column is replaced below
# by the original factor from raw_data.
# NOTE(review): raw_data_prediction still appears to contain the original
# character columns from the raw data, which scale() does not accept --
# verify the column make-up against the encoding steps above.
scaleDF <- scale(raw_data_prediction)
# 80/20 random train/test split.
id_train <- sample(1:n , 0.80*n)
df.train <- as.data.frame(scaleDF[id_train,])
df.train$IsCanceled <- raw_data[id_train,]$IsCanceled
df.test <- as.data.frame(scaleDF[-id_train,])
df.test$IsCanceled <- raw_data[-id_train,]$IsCanceled
# Class balance of the training set.
target <- as.data.frame(df.train %>% group_by(IsCanceled) %>% summarise(counts = n())) %>% mutate(perc = counts/nrow(df.train))
ggplot(target, aes(x = IsCanceled, y = perc)) + geom_bar(stat = "identity") +
geom_text(aes(label = round(perc,2)))
# Class balance of the test set.
target <- as.data.frame(df.test %>% group_by(IsCanceled) %>% summarise(counts = n())) %>% mutate(perc = counts/nrow(df.test))
ggplot(target, aes(x = IsCanceled, y = perc)) + geom_bar(stat = "identity") +
geom_text(aes(label = round(perc,2)))
#######################
# COST IN TRAINING SET#
#######################
# Grid of candidate probability cut-offs; column 2 will hold the cost.
searchgrid = seq(0.0001,0.6, 0.01)
result = cbind(searchgrid, NA)
# in the cost function, both r and pi are vectors, r=truth, pi=predicted probability
# cost1: misclassification cost of predicted probabilities at a cut-off.
#   r      - vector of true labels (0/1)
#   pi     - vector of predicted probabilities
#   cutoff - classification threshold; defaults to the global `pcut` so the
#            original call sites cost1(r, prob) keep working unchanged.
# Returns the weighted mean of false negatives and false positives.
# NOTE: the original inline comments had FP and FN swapped; corrected here.
cost1 <- function(r, pi, cutoff = pcut){
  weight1 <- 1  # weight of a false negative (actual 1, predicted 0)
  weight0 <- 1  # weight of a false positive (actual 0, predicted 1)
  c1 <- (r == 1) & (pi < cutoff) # true if actual 1 but predict 0. FN (False Negative)
  c0 <- (r == 0) & (pi > cutoff) # true if actual 0 but predict 1. FP (False Positive)
  return(mean(weight1 * c1 + weight0 * c0))
}
## Train
datos_train_x <- model.matrix(IsCanceled ~ ., df.train)[, -1]
datos_train_y <- df.train$IsCanceled
## test
datos_test_x <- model.matrix(IsCanceled ~ ., df.test)[, -1]
datos_test_y <- df.test$IsCanceled
# Logistic regression on all predictors (predictive model: multicollinearity
# is tolerated since we are not after an explanatory model).
df.glm1 <- glm(IsCanceled ~ ., family = binomial, df.train)
glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the line above is a captured glm warning (perfect separation
# / non-convergence), not code -- worth investigating the predictor set.
prob <- predict(df.glm1,type = "response")
# Evaluate the misclassification cost at every candidate cut-off; cost1
# reads the global `pcut` set on each iteration.
for (i in 1:length(searchgrid))
{
pcut <- result[i,1]
#assign the cost to the 2nd col
result[i,2] <- cost1(df.train$IsCanceled, prob)
}
plot(result, ylab = "Cost in Training Set")
# Cut-off with the minimum in-sample cost.
result[which.min(result[,2]),]
searchgrid
0.4401000 0.1512731
# searchgrid
# 0.4341000 0.1508362
#######################
cut_off <- 0.4401000#
#######################
prob.glm1.insample <- predict(df.glm1,type = "response")
predicted.glm1.insample <- (prob.glm1.insample > cut_off)
predicted.glm1.insample <- as.numeric(predicted.glm1.insample)
###################
# CONFUSION MATRIX#
###################
table(df.train$IsCanceled, predicted.glm1.insample, dnn = c("Truth","Predicted"))
Predicted
Truth 0 1
0 21072 2128
1 2720 6128
########
# ERROR#
########
mean(ifelse(df.train$IsCanceled != predicted.glm1.insample, 1, 0))
[1] 0.1512731
mean(ifelse(df.train$IsCanceled == predicted.glm1.insample, 1, 0))
[1] 0.8487269
#######################
cut_off <- 0.4401000#
#######################
prob.glm1.outsample <- predict(df.glm1, type = "response",newdata = df.test)
prediction from a rank-deficient fit may be misleading
predicted.glm1.outsample <- (prob.glm1.outsample > cut_off)
predicted.glm1.outsample <- as.numeric(predicted.glm1.outsample)
###################
# CONFUSION MATRIX#
###################
table(df.test$IsCanceled, predicted.glm1.outsample, dnn = c("Truth","Predicted"))
Predicted
Truth 0 1
0 5221 517
1 689 1585
########
# ERROR#
########
# Out-of-sample misclassification rate.
mean(ifelse(df.test$IsCanceled != predicted.glm1.outsample, 1, 0))
# [1] 0.1505242
# FIX: accuracy must be computed on the TEST set; the original repeated the
# in-sample line (df.train vs predicted.glm1.insample) here.
mean(ifelse(df.test$IsCanceled == predicted.glm1.outsample, 1, 0))
# [1] 0.8494758 (= 1 - 0.1505242)
The ROC curve is a graphical plot that illustrates the diagnostic ability of a binary classifier as its discrimination threshold is varied. It quantifies the trade-off between the true positive rate (TPR) and the false positive rate (FPR) at various cut-off settings (between 0 and 1). As the TPR increases, the FPR increases too, so we want to reach the highest TPR before the FPR grows too much; ideally the area under the curve (AUC) is close to 1.
library('ROCR')
############
# ROC CURVE#####################################################################################################
############
pred <- prediction(prob.glm1.outsample, df.test$IsCanceled)
perf <- performance(pred, "tpr", "fpr")
rocr.auc.lr = as.numeric(performance(pred, "auc")@y.values)
plot(perf, colorize = TRUE,
main = 'ROC Curve')
mtext(paste('Logistic Regression - auc : ', round(rocr.auc.lr, 5)))
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
[1] 0.9110479
#
####################################################################
# ODDS OF DEFAULT BASED ON WEIGHTS(COEFFICIENTS) GIVEN TO VARIABLES#
####################################################################
# How odds of default change by changing one unit of the variable.
temp_compare <- as.data.frame(exp(cbind(OR = coef(df.glm1))))
temp_compare
library(data.table)
setDT(temp_compare, keep.rownames = TRUE)[]
temp_compare <- temp_compare[order(temp_compare$OR,decreasing = TRUE),]
temp_compare <- temp_compare[!is.infinite(temp_compare$OR)]
temp_compare <- temp_compare[!is.na(temp_compare$OR)]
library(glmnet)
# Lasso-penalised logistic regression (alpha = 1).
# FIX: family was misspelled "binomia", which makes glmnet error out.
df.Lasso <- glmnet(datos_train_x, as.factor(datos_train_y), alpha = 1.0, family = "binomial")
# Training-set probabilities: one column per lambda on the regularisation path.
prob <- predict(df.Lasso, newx = datos_train_x, type = "response")
res <- c()
for (j in 1:ncol(prob)){
for (i in 1:length(searchgrid))
{
pcut <- result[i,1]
#assign the cost to the 2nd col
result[i,2] <- cost1(df.train$IsCanceled, prob[,j])
}
##plot(result, ylab = "Cost in Training Set")
res <- c(res,as.double(result[which.min(result[,2]),][2]))
}
which(res == min(res))[1]
[1] 98
result = cbind(searchgrid, NA)
for (i in 1:length(searchgrid))
{
pcut <- result[i,1]
#assign the cost to the 2nd col (which(res == min(res))[1])
result[i,2] <- cost1(df.train$IsCanceled, prob[,98])
}
plot(result, ylab = "Cost in Training Set")
result[which.min(result[,2]),]
searchgrid
0.4301000 0.1511483
# searchgrid
# 0.4301000 0.1511483
#######################
cut_off <- 0.4301000#
#######################
prob.Lasso <- as.double(prob[,98])
prob.Lasso <- (prob.Lasso > cut_off)
prob.Lasso <- as.numeric(prob.Lasso)
###################
# CONFUSION MATRIX#
###################
table(df.train$IsCanceled, prob.Lasso, dnn = c("Truth","Predicted"))
Predicted
Truth 0 1
0 20936 2264
1 2580 6268
########
# ERROR#
########
mean(ifelse(df.train$IsCanceled != prob.Lasso, 1, 0))
[1] 0.1511483
#######################
cut_off <- 0.4301000#
#######################
prob <- predict(df.Lasso, newx = datos_test_x, type = "response")
prob.Lasso <- as.double(prob[,98])
prob.Lasso <- (prob.Lasso > cut_off)
prob.Lasso <- as.numeric(prob.Lasso)
###################
# CONFUSION MATRIX#
###################
table(df.test$IsCanceled, prob.Lasso, dnn = c("Truth","Predicted"))
Predicted
Truth 0 1
0 5185 553
1 645 1629
########
# ERROR#
########
# Out-of-sample misclassification rate for the lasso model.
mean(ifelse(df.test$IsCanceled != prob.Lasso, 1, 0))
# [1] 0.1495257
# FIX: the original compared the TRAINING labels with the TEST predictions
# (vectors of different lengths, silently recycled -> the meaningless 0.6002).
mean(ifelse(df.test$IsCanceled == prob.Lasso, 1, 0))
# [1] 0.8504743 (= 1 - 0.1495257)
The ROC curve is a graphical plot that illustrates the diagnostic ability of a binary classifier as its discrimination threshold is varied. It quantifies the trade-off between the true positive rate (TPR) and the false positive rate (FPR) at various cut-off settings (between 0 and 1). As the TPR increases, the FPR increases too, so we want to reach the highest TPR before the FPR grows too much; ideally the area under the curve (AUC) is close to 1.
############
# ROC CURVE#####################################################################################################
############
# FIX: build the ROC from the predicted PROBABILITIES, not from the 0/1 labels
# held by prob.Lasso after thresholding -- an ROC on hard labels collapses to a
# single operating point and understates the AUC. `prob` here is the test-set
# probability matrix from the predict() call above; column 98 is the lambda
# selected earlier.
pred <- prediction(as.double(prob[, 98]), df.test$IsCanceled)
perf <- performance(pred, "tpr", "fpr")
rocr.auc.lr = as.numeric(performance(pred, "auc")@y.values)
plot(perf, colorize = TRUE,
     main = 'ROC Curve')
# FIX: label said 'Logistic Regression' although this is the lasso model.
mtext(paste('Lasso Logistic Regression - auc : ', round(rocr.auc.lr, 5)))
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
[1] 0.8099919
#
####################################################################
# ODDS OF DEFAULT BASED ON WEIGHTS(COEFFICIENTS) GIVEN TO VARIABLES#
####################################################################
# How odds of default change by changing one unit of the variable.
# FIX: apply exp() so the "OR" column really contains odds ratios, consistent
# with the glm version (exp(coef(df.glm1))); the original stored the raw lasso
# coefficients under an OR label. Column 10 selects one lambda on the path.
temp_compareL <- as.data.frame(exp(cbind(OR = coef(df.Lasso)[,10])))
temp_compareL
# Move the row names (variable names) into a regular column.
setDT(temp_compareL, keep.rownames = TRUE)[]
temp_compareL <- temp_compareL[order(temp_compareL$OR,decreasing = TRUE),]
temp_compareL <- temp_compareL[!is.infinite(temp_compareL$OR)]
temp_compareL <- temp_compareL[!is.na(temp_compareL$OR)]
temp_compareL
El tercer documento, de extensión máxima 6 pp, resolverá un problema de predicción de series temporales. En primer lugar, se debe elaborar la serie semanal de reservas realizadas. Se deberá plantear un modelo adecuado para predecir esta serie temporal.
Time Series: HOTEL’S MANAGEMENT - Number of Reservations - ArrivalDate - Number of Reservations that will CheckOut - ArrivalDate - Number of Reservations that will not CheckOut - ArrivalDate
HOTEL’S MARKETING
- Number of Reservations that will CheckOut - ReservationDate - Number of Reservations that will not CheckOut- ReservationDate
erase_columns <- c(12, 13, 14, 15, 19, 20, 22, 26, 30)
timeSeriesDataSet <- timeSeriesDataSet[ -erase_columns ]
# Build a Date from separate year / month / day components.
# y, m, d: numeric vectors (month 1-12). Returns a Date vector, with NA where
# the components do not form a valid calendar date.
# (Simplified to a single paste; the original returned the final assignment
# invisibly, which only worked because every caller assigned the result.)
yearMonthDay <- function(y, m, d){
  as.Date(paste(y, m, d, sep = "-"), format = "%Y-%m-%d")
}
arrival_date <- yearMonthDay(y = timeSeriesDataSet$ArrivalDateYear,
m = timeSeriesDataSet$ArrivalDateMonth,
d = timeSeriesDataSet$ArrivalDateDayOfMonth)
arrival_date <- as.data.frame(arrival_date)
a = arrival_date %>% group_by(arrival_date) %>% summarise(n = n())
data <- as.xts(a$n,order.by=as.Date(a$arrival_date))
weekly <- apply.weekly(data,sum)
weekly <- as.data.frame(weekly)
#index of a dataframe to columns
library(data.table)
setDT(weekly, keep.rownames = TRUE)[]
weekly$rn <- as.Date(weekly$rn)
arrivalDatePlot <- weekly
ggplot(data = weekly,
aes(x = rn))+
geom_line(aes(y = V1, color = "Number of Reservations for ArrivalDate"))+
ylab('Number of Reservations for that Arrival Date')+
xlab('Date - Weekly')+
theme(legend.position = "none")
# Build a Date from separate year / month / day components.
# y, m, d: numeric vectors (month 1-12). Returns a Date vector, with NA where
# the components do not form a valid calendar date.
# (Simplified to a single paste; the original returned the final assignment
# invisibly, which only worked because every caller assigned the result.)
yearMonthDay <- function(y, m, d){
  as.Date(paste(y, m, d, sep = "-"), format = "%Y-%m-%d")
}
arrival_date <- yearMonthDay(y = timeSeriesDataSet$ArrivalDateYear,
m = timeSeriesDataSet$ArrivalDateMonth,
d = timeSeriesDataSet$ArrivalDateDayOfMonth)
reservationDate <- arrival_date - timeSeriesDataSet$LeadTime
reservationDate <- as.data.frame(reservationDate)
# reservationDate <- reservationDate%>%rename( 'reservationDate' = 'arrival_date' )
a = reservationDate %>% group_by(reservationDate) %>% summarise(n = n())
data <- as.xts(a$n,order.by=as.Date(a$reservationDate))
weekly <- apply.weekly(data,sum)
weekly <- as.data.frame(weekly)
#index of a dataframe to columns
library(data.table)
setDT(weekly, keep.rownames = TRUE)[]
weekly$rn <- as.Date(weekly$rn)
reservationDatePlot <- weekly
ggplot(data = weekly,
aes(x = rn))+
geom_line(aes(y = V1, color = "Number of Reservations for Reservation Date"))+
ylab('Number of Reservations for that Reservation Date')+
xlab('Date - Weekly')+
theme(legend.position = "none")
ggplot() +
geom_line(data = arrivalDatePlot, aes(x = rn, y = V1, col = "Arrival Date")) +
geom_line(data = reservationDatePlot, aes(x = rn, y = V1, col = "Reservation Date"))+
xlab('Date - Weeks')+
ylab('Number of Reservations')+
ggtitle('Number of Reservations')
nonCheckoutTS<-timeSeriesDataSet%>%filter(IsCanceled == 1)
checkoutTS <-timeSeriesDataSet%>%filter(IsCanceled == 0)
arrival_dateCheckout <- yearMonthDay(y = checkoutTS$ArrivalDateYear,
m = checkoutTS$ArrivalDateMonth,
d = checkoutTS$ArrivalDateDayOfMonth)
arrival_dateCheckout <- as.data.frame(arrival_dateCheckout)
a = arrival_dateCheckout %>% group_by(arrival_dateCheckout) %>% summarise(n = n())
data <- as.xts(a$n,order.by=as.Date(a$arrival_dateCheckout))
weekly <- apply.weekly(data,sum)
weekly <- as.data.frame(weekly)
#index of a dataframe to columns
library(data.table)
setDT(weekly, keep.rownames = TRUE)[]
weekly$rn <- as.Date(weekly$rn)
arrivalDatePlotCheckout <- weekly
ggplot(data = weekly,
aes(x = rn))+
geom_line(aes(y = V1, color = "Number of Reservations for ArrivalDate of people who Checkout"))+
ylab('Number of Reservations for that Arrival Date of people who Checkout')+
xlab('Date - Weekly')+
theme(legend.position = "none")
arrival_dateCanceled <- yearMonthDay(y = nonCheckoutTS$ArrivalDateYear,
m = nonCheckoutTS$ArrivalDateMonth,
d = nonCheckoutTS$ArrivalDateDayOfMonth)
arrival_dateCanceled <- as.data.frame(arrival_dateCanceled)
a = arrival_dateCanceled %>% group_by(arrival_dateCanceled) %>% summarise(n = n())
data <- as.xts(a$n,order.by=as.Date(a$arrival_dateCanceled))
weekly <- apply.weekly(data,sum)
weekly <- as.data.frame(weekly)
#index of a dataframe to columns
library(data.table)
setDT(weekly, keep.rownames = TRUE)[]
weekly$rn <- as.Date(weekly$rn)
arrivalDatePlotCanceled <- weekly
ggplot(data = weekly,
aes(x = rn))+
geom_line(aes(y = V1, color = "Number of Reservations for ArrivalDate of people who Checkout"))+
ylab('Number of Reservations for that Arrival Date of people who Checkout')+
xlab('Date - Weekly')+
theme(legend.position = "none")
ggplot() +
geom_line(data = arrivalDatePlotCheckout, aes(x = rn, y = V1, col = "Arrival Date Checkout")) +
geom_line(data = arrivalDatePlotCanceled, aes(x = rn, y = V1, col = "Arrival Date Canceled"))+
xlab('Date - Weeks')+
ylab('Number of Reservations')+
ggtitle('Number of Reservations')
reservetion_dateNonCheckout <- yearMonthDay(y = nonCheckoutTS$ArrivalDateYear,
m = nonCheckoutTS$ArrivalDateMonth,
d = nonCheckoutTS$ArrivalDateDayOfMonth)
reservetion_dateNonCheckout <- reservetion_dateNonCheckout - nonCheckoutTS$LeadTime
reservetion_dateNonCheckout <- as.data.frame(reservetion_dateNonCheckout)
a = reservetion_dateNonCheckout %>% group_by(reservetion_dateNonCheckout) %>% summarise(n = n())
data <- as.xts(a$n,order.by=as.Date(a$reservetion_dateNonCheckout))
weekly <- apply.weekly(data,sum)
weekly <- as.data.frame(weekly)
#index of a dataframe to columns
library(data.table)
setDT(weekly, keep.rownames = TRUE)[]
weekly$rn <- as.Date(weekly$rn)
reservetionDatePlotCanceled <- weekly
ggplot(data = weekly,
aes(x = rn))+
geom_line(aes(y = V1, color = "Number of Reservations ReservationDate Canceled"))+
ylab('Number of Reservations ReservationDate Canceled')+
xlab('Date - Weekly')+
theme(legend.position = "none")
reservetion_dateCheckout <- yearMonthDay(y = checkoutTS$ArrivalDateYear,
m = checkoutTS$ArrivalDateMonth,
d = checkoutTS$ArrivalDateDayOfMonth)
reservetion_dateCheckout <- reservetion_dateCheckout - checkoutTS$LeadTime
reservetion_dateCheckout <- as.data.frame(reservetion_dateCheckout)
a = reservetion_dateCheckout %>% group_by(reservetion_dateCheckout) %>% summarise(n = n())
data <- as.xts(a$n,order.by=as.Date(a$reservetion_dateCheckout))
weekly <- apply.weekly(data,sum)
weekly <- as.data.frame(weekly)
#index of a dataframe to columns
library(data.table)
setDT(weekly, keep.rownames = TRUE)[]
weekly$rn <- as.Date(weekly$rn)
reservetionDatePlotCheckout <- weekly
ggplot(data = weekly,
aes(x = rn))+
geom_line(aes(y = V1, color = "Number of Reservations ReservationDate Checkout"))+
ylab('Number of Reservations ReservationDate Checkout')+
xlab('Date - Weekly')+
theme(legend.position = "none")
ggplot() +
geom_line(data = reservetionDatePlotCheckout, aes(x = rn, y = V1, col = "Reservation Date Checkout")) +
geom_line(data = reservetionDatePlotCanceled, aes(x = rn, y = V1, col = "Reservetion Date Canceled"))+
xlab('Date - Weeks')+
ylab('Number of Reservations')+
ggtitle('Number of Reservations')
# Build a Date from separate year / month / day components.
# y, m, d: numeric vectors (month 1-12). Returns a Date vector, with NA where
# the components do not form a valid calendar date.
# (Simplified to a single paste; the original returned the final assignment
# invisibly, which only worked because every caller assigned the result.)
yearMonthDay <- function(y, m, d){
  as.Date(paste(y, m, d, sep = "-"), format = "%Y-%m-%d")
}
arrival_date <- yearMonthDay(y = timeSeriesDataSet$ArrivalDateYear,
m = timeSeriesDataSet$ArrivalDateMonth,
d = timeSeriesDataSet$ArrivalDateDayOfMonth)
arrival_dateAdr<- as.data.frame(arrival_date)
arrival_dateAdr$ADR <- raw_data$ADR
colnames(arrival_dateAdr) <- c("arrival_date", "ADR")
a = arrival_dateAdr %>% group_by(arrival_date) %>% summarize(totalMean=mean(ADR))
data <- as.xts(a$totalMean,order.by=as.Date(a$arrival_date))
weekly <- apply.weekly(data,mean)
weekly <- as.data.frame(weekly)
#index of a dataframe to columns
library(data.table)
setDT(weekly, keep.rownames = TRUE)[]
weekly$rn <- as.Date(weekly$rn)
arrivalDatePlotAdr <- weekly
ggplot(data = weekly,
aes(x = rn))+
geom_line(aes(y = V1, color = "ADR for ArrivalDate"))+
ylab('ADR for that Arrival Date')+
xlab('Date - Weekly')+
theme(legend.position = "none")
ggplot() +
geom_line(data = arrivalDatePlot, aes(x = rn, y = V1, col = "Arrival Date")) +
geom_line(data = arrivalDatePlotAdr, aes(x = rn, y = V1, col = "Mean ADR Date"))+
geom_line(data = arrivalDatePlotCheckout, aes(x = rn, y = V1, col = "Arrival Date not Canceled"))+
xlab('Date - Weeks')+
ylab('Number of Reservations')+
ggtitle('Number of Reservations')
#Transform to zoo data (forecast package)
zReservationDateNumber=as.zoo(reservationDatePlot$V1)
zArrivalDateNumber=as.zoo(arrivalDatePlot$V1)
zAdrDateNumber = as.zoo(arrivalDatePlotAdr$V1)
zArrivalDateCanceled = as.zoo(arrivalDatePlotCanceled$V1)
zArrivalDateCheckout = as.zoo(arrivalDatePlotCheckout$V1)
zReservationDateCanceled = as.zoo(reservetionDatePlotCanceled$V1)
zReservationDateCheckout = as.zoo(reservetionDatePlotCheckout$V1)
#Seasonal Plot
ggfreqplot(as.ts(zReservationDateNumber),freq=4,nrow=1,facet.labeller=c("1T","2T","3T","4T"))+ggtitle("Number of Reservations - Reservation Date ")
#Seasonal Plot
ggfreqplot(as.ts(zArrivalDateNumber),freq=4,nrow=1,facet.labeller=c("1T","2T","3T","4T"))+ggtitle("Number of Reservations - Arrival Date ")
#Seasonal Plot
ggfreqplot(as.ts(zAdrDateNumber),freq=4,nrow=1,facet.labeller=c("1T","2T","3T","4T"))+ggtitle("Mean ADR - Arrival Date ")
#Seasonal Plot
ggfreqplot(as.ts(zArrivalDateCanceled),freq=4,nrow=1,facet.labeller=c("1T","2T","3T","4T"))+ggtitle("Mean ADR - Arrival Date ")
#Seasonal Plot
ggfreqplot(as.ts(zArrivalDateCheckout),freq=4,nrow=1,facet.labeller=c("1T","2T","3T","4T"))+ggtitle("Mean ADR - Arrival Date ")
#Seasonal Plot
ggfreqplot(as.ts(zReservationDateCanceled),freq=4,nrow=1,facet.labeller=c("1T","2T","3T","4T"))+ggtitle("Mean ADR - Arrival Date ")
#Seasonal Plot
ggfreqplot(as.ts(zReservationDateCheckout),freq=4,nrow=1,facet.labeller=c("1T","2T","3T","4T"))+ggtitle("Mean ADR - Arrival Date ")
NA
Log transformation and differences
#Select number of observation to compare forecast
# Hold out the last cOmit weekly observations of each series for evaluation.
cOmit=20
#Data Size
nObsArrival=length(zArrivalDateNumber)
oArrivalDateNumber <- window(zArrivalDateNumber,start=index(zArrivalDateNumber[1]),end=index(zArrivalDateNumber[nObsArrival-cOmit]))
#Data Size
nObsReservation=length(zReservationDateNumber)
oReservationDateNumber <- window(zReservationDateNumber,start=index(zReservationDateNumber[1]),end=index(zReservationDateNumber[nObsReservation-cOmit]))
#Data Size
nObsCanceled=length(zArrivalDateCanceled)
oArrivalDateCanceled <- window(zArrivalDateCanceled,start=index(zArrivalDateCanceled[1]),end=index(zArrivalDateCanceled[nObsCanceled-cOmit]))
#Data Size
nObsCheckout=length(zArrivalDateCheckout)
oArrivalDateCheckout <- window(zArrivalDateCheckout,start=index(zArrivalDateCheckout[1]),end=index(zArrivalDateCheckout[nObsCheckout-cOmit]))
#Data Size
# WARNING(review): nObsCheckout is REUSED here for a different series; after
# this line it no longer holds length(zArrivalDateCheckout). Later blocks that
# index zArrivalDateCheckout with nObsCheckout will read the wrong positions.
nObsCheckout=length(zReservationDateCanceled)
oReservationDateCanceled <- window(zReservationDateCanceled,start=index(zReservationDateCanceled[1]),end=index(zReservationDateCanceled[nObsCheckout-cOmit]))
#Data Size
# WARNING(review): nObsCheckout overwritten again (now the reservation-date
# checkout length). Use distinct variable names per series.
nObsCheckout=length(zReservationDateCheckout)
oReservationDateCheckout <- window(zReservationDateCheckout,start=index(zReservationDateCheckout[1]),end=index(zReservationDateCheckout[nObsCheckout-cOmit]))
etsfitArrival<-ets(oArrivalDateNumber, lambda = "auto", use.initial.values = TRUE, nmse = 30)
etsfitReservation<-ets(oReservationDateNumber, lambda = "auto", use.initial.values = TRUE, nmse = 30)
etsfitArrivalCanceled<-ets(oArrivalDateCanceled, lambda = "auto", use.initial.values = TRUE, nmse = 30)
etsfitArrivalCheckout<-ets(oArrivalDateCheckout, lambda = "auto", use.initial.values = TRUE, nmse = 30)
etsfitReservationCanceled<-ets(oReservationDateCanceled, lambda = "auto", use.initial.values = TRUE, nmse = 30)
etsfitReservationCheckout<-ets(oReservationDateCheckout, lambda = "auto", use.initial.values = TRUE, nmse = 30)
#forecast model
fArrival.ets=forecast(etsfitArrival, h = 20)
fReservation.ets=forecast(etsfitReservation, h = 20)
fCanceled.ets=forecast(etsfitArrivalCanceled, h = 20)
fCheckout.ets=forecast(etsfitArrivalCheckout, h = 20)
fCanceledReservation.ets=forecast(etsfitReservationCanceled, h = 20)
fCheckoutReservation.ets=forecast(etsfitReservationCheckout, h = 20)
plot(fArrival.ets)
lines(window(zArrivalDateNumber),type="o")
plot(fReservation.ets)
lines(window(zReservationDateNumber),type="o")
plot(fCanceled.ets)
lines(window(zArrivalDateCanceled),type="o")
plot(fCheckout.ets)
lines(window(zArrivalDateCheckout),type="o")
plot(fCanceledReservation.ets)
lines(window(zReservationDateCanceled),type="o")
plot(fCheckoutReservation.ets)
lines(window(zReservationDateCheckout),type="o")
#Actual and Forecast
error_forecast_actual <- matrix(c(fArrival.ets$mean[1:cOmit],zArrivalDateNumber[(nObsArrival-cOmit+1):nObsArrival]),ncol=2)
#MAE
cat("MAE:",mean(abs(error_forecast_actual[,2]-error_forecast_actual[,1])))#1 forecast, 2 actual.
# [1] 345.3866
cat("\n\n")
#BIAS
bias = sum(error_forecast_actual[,1]-error_forecast_actual[,2]) * 1.0/length(error_forecast_actual[,2])
cat('Bias %:', bias)
#Actual and Forecast
error_forecast_actual <- matrix(c(fReservation.ets$mean[1:cOmit],zReservationDateNumber[(nObsReservation-cOmit+1):nObsReservation]),ncol=2)
#MAE
cat("MAE:",mean(abs(error_forecast_actual[,2]-error_forecast_actual[,1])))#1 forecast, 2 actual.
# [1] 345.3866
cat("\n\n")
#BIAS
bias = sum(error_forecast_actual[,1]-error_forecast_actual[,2]) * 1.0/length(error_forecast_actual[,2])
cat('Bias %:', bias)
#Actual and Forecast
error_forecast_actual <- matrix(c(fCanceled.ets$mean[1:cOmit],zArrivalDateCanceled[(nObsCanceled-cOmit+1):nObsCanceled]),ncol=2)
error_forecast_actualCanceled <- error_forecast_actual
#MAE
cat("MAE:",mean(abs(error_forecast_actual[,2]-error_forecast_actual[,1])))#1 forecast, 2 actual.
# [1] 345.3866
cat("\n\n")
#BIAS
bias = sum(error_forecast_actual[,1]-error_forecast_actual[,2]) * 1.0/length(error_forecast_actual[,2])
cat('Bias %:', bias)
# Actual vs. forecast for the arrival-date checkout series.
# FIX: nObsCheckout was overwritten further up (it holds the length of the
# reservation-date series by this point), so recompute the correct length
# locally before indexing zArrivalDateCheckout.
nObsArrCheckout <- length(zArrivalDateCheckout)
error_forecast_actual <- matrix(c(fCheckout.ets$mean[1:cOmit],
                                  zArrivalDateCheckout[(nObsArrCheckout-cOmit+1):nObsArrCheckout]),
                                ncol = 2)
error_forecast_actualCheckout <- error_forecast_actual
#MAE (column 1 = forecast, column 2 = actual)
cat("MAE:",mean(abs(error_forecast_actual[,2]-error_forecast_actual[,1])))
cat("\n\n")
#BIAS  (NOTE: this is the mean forecast error, not a percentage, despite the label)
bias = sum(error_forecast_actual[,1]-error_forecast_actual[,2]) * 1.0/length(error_forecast_actual[,2])
cat('Bias %:', bias)
#Actual and Forecast
error_forecast_actual <- error_forecast_actualCanceled + error_forecast_actualCheckout
#MAE
cat("MAE:",mean(abs(error_forecast_actual[,2]-error_forecast_actual[,1])))#1 forecast, 2 actual.
# [1] 345.3866
cat("\n\n")
#BIAS
bias = sum(error_forecast_actual[,1]-error_forecast_actual[,2]) * 1.0/length(error_forecast_actual[,2])
cat('Bias %:', bias)
# Actual vs. forecast for the reservation-date cancellation series.
# FIX: the original used fReservation.ets (forecast of TOTAL reservations) and
# nObsCanceled (length of the ARRIVAL-date canceled series) here; use the
# canceled-reservation forecast and the matching series length instead.
# (The stale pasted output, MAE == Bias == 79.77, shows the wrong forecast
# exceeded the actuals everywhere.)
nObsResCanceled <- length(zReservationDateCanceled)
error_forecast_actual <- matrix(c(fCanceledReservation.ets$mean[1:cOmit],
                                  zReservationDateCanceled[(nObsResCanceled-cOmit+1):nObsResCanceled]),
                                ncol = 2)
error_forecast_actualCanceledReservation <- error_forecast_actual
#MAE (column 1 = forecast, column 2 = actual)
cat("MAE:",mean(abs(error_forecast_actual[,2]-error_forecast_actual[,1])))
# MAE: 79.7721   <- stale output from the buggy version
cat("\n\n")
#BIAS  (NOTE: mean forecast error, not a percentage, despite the label)
bias = sum(error_forecast_actual[,1]-error_forecast_actual[,2]) * 1.0/length(error_forecast_actual[,2])
cat('Bias %:', bias)
# Bias %: 79.7721   <- stale output from the buggy version
#Actual and Forecast
error_forecast_actual <- matrix(c(fCheckoutReservation.ets$mean[1:cOmit],zReservationDateCheckout[(nObsCheckout-cOmit+1):nObsCheckout]),ncol=2)
error_forecast_actualCheckoutReservation <- error_forecast_actual
#MAE
cat("MAE:",mean(abs(error_forecast_actual[,2]-error_forecast_actual[,1])))#1 forecast, 2 actual.
MAE: 35.35768
# [1] 345.3866
cat("\n\n")
#BIAS
bias = sum(error_forecast_actual[,1]-error_forecast_actual[,2]) * 1.0/length(error_forecast_actual[,2])
cat('Bias %:', bias)
Bias %: 27.46537
#Actual and Forecast
error_forecast_actual <- error_forecast_actualCanceledReservation + error_forecast_actualCheckoutReservation
#MAE
cat("MAE:",mean(abs(error_forecast_actual[,2]-error_forecast_actual[,1])))#1 forecast, 2 actual.
MAE: 107.2375
# [1] 345.3866
cat("\n\n")
#BIAS
bias = sum(error_forecast_actual[,1]-error_forecast_actual[,2]) * 1.0/length(error_forecast_actual[,2])
cat('Bias %:', bias)
Bias %: 107.2375
# fit1Arrival=auto.arima(oArrivalDateNumber,lambda = 0)
# NOTE(review): max.d / max.D are the maximum orders of (seasonal)
# DIFFERENCING; 52 looks like a confusion with the weekly seasonal period
# (auto.arima's defaults are max.d = 2, max.D = 1). Also prefer seasonal = TRUE
# over the reassignable shorthand T. Verify before changing, as later pasted
# outputs depend on this call.
fit1Arrival=auto.arima(oArrivalDateNumber, lambda = 0, start.p = 0, start.Q = 0, max.p = 10, max.q = 10, start.P = 0, start.q = 0,max.P = 10, max.Q = 10, max.d = 52, max.D = 52,seasonal = T)
summary(fit1Arrival)
#residual analysis
ggtsdisplay(fit1Arrival$residuals)
# Box-Ljung test; fitdf = 3 because three parameters were estimated.
# NOTE(review): lag = 1 with fitdf = 3 gives negative degrees of freedom --
# the lag should exceed fitdf (as in the commented lines below).
Box.test(fit1Arrival$residuals,lag=1, fitdf=3, type="Lj")
# Box.test(fit1Arrival$residuals,lag=8, fitdf=3, type="Lj")
# Box.test(fit1Arrival$residuals,lag=12, fitdf=3, type="Lj")
fit1Reservation=auto.arima(oReservationDateNumber, lambda = 0, start.p = 0, start.Q = 0, max.p = 10, max.q = 10, start.P = 0, start.q = 0,max.P = 10, max.Q = 10, max.d = 52, max.D = 52,seasonal = T)
summary(fit1Reservation)
#residual analysis
ggtsdisplay(fit1Reservation$residuals)
#box-Ljung Test, 3 porqué hemos estimado 3.
Box.test(fit1Reservation$residuals,lag=4, fitdf=3, type="Lj")
Box.test(fit1Reservation$residuals,lag=8, fitdf=3, type="Lj")
Box.test(fit1Reservation$residuals,lag=12, fitdf=3, type="Lj")
fit1Canceled=auto.arima(oArrivalDateCanceled, lambda = 0, start.p = 0, start.Q = 0, max.p = 10, max.q = 10, start.P = 0, start.q = 0,max.P = 10, max.Q = 10, max.d = 52, max.D = 52,seasonal = T)
summary(fit1Canceled)
#residual analysis
ggtsdisplay(fit1Canceled$residuals)
#box-Ljung Test, 3 porqué hemos estimado 3.
Box.test(fit1Canceled$residuals,lag=4, fitdf=3, type="Lj")
Box.test(fit1Canceled$residuals,lag=8, fitdf=3, type="Lj")
Box.test(fit1Canceled$residuals,lag=12, fitdf=3, type="Lj")
fit1Checkout=auto.arima(oArrivalDateCheckout, lambda = 0, start.p = 0, start.Q = 0, max.p = 10, max.q = 10, start.P = 0, start.q = 0,max.P = 10, max.Q = 10, max.d = 52, max.D = 52,seasonal = T)
summary(fit1Checkout)
#residual analysis
ggtsdisplay(fit1Checkout$residuals)
#box-Ljung Test, 3 porqué hemos estimado 3.
Box.test(fit1Checkout$residuals,lag=4, fitdf=3, type="Lj")
Box.test(fit1Checkout$residuals,lag=8, fitdf=3, type="Lj")
Box.test(fit1Checkout$residuals,lag=12, fitdf=3, type="Lj")
fit1CanceledReservation=auto.arima(oReservationDateCanceled, lambda = 0, start.p = 0, start.Q = 0, max.p = 10, max.q = 10, start.P = 0, start.q = 0,max.P = 10, max.Q = 10, max.d = 52, max.D = 52,seasonal = T)
summary(fit1CanceledReservation)
Series: oReservationDateCanceled
ARIMA(1,1,1)
Box Cox transformation: lambda= 0
Coefficients:
ar1 ma1
0.4577 -0.7862
s.e. 0.1439 0.1009
sigma^2 estimated as 0.4038: log likelihood=-122.73
AIC=251.45 AICc=251.64 BIC=260.01
Training set error measures:
ME RMSE MAE MPE MAPE
Training set 7.127027 46.2998 28.90803 -8.435588 43.42391
MASE ACF1
Training set 0.8307652 -0.1353022
#residual analysis
ggtsdisplay(fit1CanceledReservation$residuals)
#box-Ljung Test, 3 porqué hemos estimado 3.
Box.test(fit1CanceledReservation$residuals,lag=4, fitdf=3, type="Lj")
Box-Ljung test
data: fit1CanceledReservation$residuals
X-squared = 6.168, df = 1, p-value = 0.01301
Box.test(fit1CanceledReservation$residuals,lag=8, fitdf=3, type="Lj")
Box-Ljung test
data: fit1CanceledReservation$residuals
X-squared = 12.839, df = 5, p-value = 0.02493
Box.test(fit1CanceledReservation$residuals,lag=12, fitdf=3, type="Lj")
Box-Ljung test
data: fit1CanceledReservation$residuals
X-squared = 18.765, df = 9, p-value = 0.02726
fit1CheckoutReservation=auto.arima(oReservationDateCheckout, lambda = 0, start.p = 0, start.Q = 0, max.p = 10, max.q = 10, start.P = 0, start.q = 0,max.P = 10, max.Q = 10, max.d = 52, max.D = 52,seasonal = T)
summary(fit1CheckoutReservation)
Series: oReservationDateCheckout
ARIMA(0,1,1)
Box Cox transformation: lambda= 0
Coefficients:
ma1
-0.3107
s.e. 0.0879
sigma^2 estimated as 0.3172: log likelihood=-117.82
AIC=239.63 AICc=239.72 BIC=245.52
Training set error measures:
ME RMSE MAE MPE MAPE
Training set 6.638264 92.11325 51.90594 -7.965977 34.87243
MASE ACF1
Training set 0.9177611 -0.1112676
#residual analysis
ggtsdisplay(fit1CheckoutReservation$residuals)
#box-Ljung Test, 3 porqué hemos estimado 3.
Box.test(fit1CheckoutReservation$residuals,lag=4, fitdf=3, type="Lj")
Box-Ljung test
data: fit1CheckoutReservation$residuals
X-squared = 6.3696, df = 1, p-value = 0.01161
Box.test(fit1CheckoutReservation$residuals,lag=8, fitdf=3, type="Lj")
Box-Ljung test
data: fit1CheckoutReservation$residuals
X-squared = 14.4, df = 5, p-value = 0.01326
Box.test(fit1CheckoutReservation$residuals,lag=12, fitdf=3, type="Lj")
Box-Ljung test
data: fit1CheckoutReservation$residuals
X-squared = 20.795, df = 9, p-value = 0.01359
fArrival.arima=forecast(fit1Arrival, h = 20)
plot(fArrival.arima)
lines(window(zArrivalDateNumber),type="o")
res_ArimaMatrix <- matrix(c(fArrival.arima$mean[1:20],
as.double(tail(zArrivalDateNumber,n=20))),
ncol = 2)
res_ArimaMatrix
fReservation.arima=forecast(fit1Reservation, h = 20)
plot(fReservation.arima)
lines(window(zReservationDateNumber),type="o")
res_ArimaMatrix <- matrix(c(fReservation.arima$mean[1:20],
as.double(tail(zReservationDateNumber,n=20))),
ncol = 2)
res_ArimaMatrix
[,1] [,2]
[1,] 20.29468 183
[2,] 17.17760 208
[3,] 19.30864 197
[4,] 17.78822 246
[5,] 18.84139 191
[6,] 18.09646 153
[7,] 18.61574 166
[8,] 18.25002 141
[9,] 18.50574 133
[10,] 18.32603 151
[11,] 18.45188 152
[12,] 18.36353 173
[13,] 18.42545 170
[14,] 18.38200 120
[15,] 18.41246 105
[16,] 18.39109 106
[17,] 18.40608 78
[18,] 18.39557 77
[19,] 18.40294 54
[20,] 18.39777 16
fCanceled.arima=forecast(fit1Canceled, h = 20)
plot(fCanceled.arima)
lines(window(zArrivalDateCanceled),type="o")
res_ArimaMatrix <- matrix(c(fCanceled.arima$mean[1:20],
as.double(tail(zArrivalDateCanceled,n=20))),
ncol = 2)
res_ArimaMatrix
fCheckout.arima=forecast(fit1Checkout, h = 20)
plot(fCheckout.arima)
lines(window(zArrivalDateCheckout),type="o")
res_ArimaMatrix <- matrix(c(fCheckout.arima$mean[1:20],
as.double(tail(zArrivalDateCheckout,n=20))),
ncol = 2)
res_ArimaMatrix
[,1] [,2]
[1,] 249.5299 295
[2,] 243.9061 292
[3,] 238.9388 257
[4,] 234.8387 278
[5,] 231.7487 240
[6,] 229.7488 337
[7,] 228.8620 284
[8,] 229.0601 251
[9,] 230.2687 221
[10,] 232.3721 227
[11,] 235.2175 246
[12,] 238.6206 240
[13,] 242.3728 279
[14,] 246.2497 221
[15,] 250.0239 236
[16,] 253.4773 255
[17,] 256.4163 244
[18,] 258.6851 261
[19,] 260.1764 250
[20,] 260.8388 131
fCanceled.arimaReservation=forecast(fit1CanceledReservation, h = 20)
plot(fCanceled.arimaReservation)
lines(window(zReservationDateCanceled),type="o")
res_ArimaMatrix <- matrix(c(fCanceled.arimaReservation$mean[1:20],
as.double(tail(zReservationDateCanceled,n=20))),
ncol = 2)
res_ArimaMatrix
[,1] [,2]
[1,] 61.85637 45
[2,] 66.38913 44
[3,] 68.57294 42
[4,] 69.59620 62
[5,] 70.06957 83
[6,] 70.28729 58
[7,] 70.38716 37
[8,] 70.43291 37
[9,] 70.45385 43
[10,] 70.46344 37
[11,] 70.46783 45
[12,] 70.46984 44
[13,] 70.47076 54
[14,] 70.47118 49
[15,] 70.47137 24
[16,] 70.47146 21
[17,] 70.47150 22
[18,] 70.47152 18
[19,] 70.47153 16
[20,] 70.47153 5
fCheckout.arimaReservation=forecast(fit1CheckoutReservation, h = 20)
plot(fCheckout.arimaReservation)
lines(window(zReservationDateCheckout),type="o")
res_ArimaMatrix <- matrix(c(fCheckout.arimaReservation$mean[1:20],
as.double(tail(zReservationDateCheckout,n=20))),
ncol = 2)
res_ArimaMatrix
[,1] [,2]
[1,] 128.2248 139
[2,] 128.2248 166
[3,] 128.2248 135
[4,] 128.2248 163
[5,] 128.2248 133
[6,] 128.2248 116
[7,] 128.2248 129
[8,] 128.2248 98
[9,] 128.2248 96
[10,] 128.2248 106
[11,] 128.2248 108
[12,] 128.2248 119
[13,] 128.2248 121
[14,] 128.2248 96
[15,] 128.2248 84
[16,] 128.2248 84
[17,] 128.2248 60
[18,] 128.2248 61
[19,] 128.2248 49
[20,] 128.2248 16
error_forecast<- as.data.frame(fArrival.arima$mean[1:20])#forecast
error_actual <- as.data.frame(tail(zArrivalDateNumber,n=20))#actual
dim(error_actual)[1]
[1] 20
error_actual_total <- error_actual
#MAE
cat("MAE:",mean(as.matrix(abs(error_forecast - error_actual))))#1 forecast, 2 actual.
MAE: 60.74243
cat("\n\n")
#BIAS
bias = sum(as.matrix(error_forecast - error_actual)) * 1.0/dim(error_actual)[1]
cat('Bias %:', bias)
Bias %: -45.46741
error_forecast<- as.data.frame(fReservation.arima$mean[1:20])#forecast
error_actual <- as.data.frame(tail(zReservationDateNumber,n=20))#actual
dim(error_actual)[1]
[1] 20
error_actual_total <- error_actual
#MAE
cat("MAE:",mean(as.matrix(abs(error_forecast - error_actual))))#1 forecast, 2 actual.
MAE: 44.90987
cat("\n\n")
#BIAS
bias = sum(as.matrix(error_forecast - error_actual)) * 1.0/dim(error_actual)[1]
cat('Bias %:', bias)
Bias %: 22.71995
error_forecastCanceled<- as.data.frame(fCanceled.arima$mean[1:20])#forecast
error_actualCanceled <- as.data.frame(tail(zArrivalDateCanceled,n=20))#actual
dim(error_actualCanceled)[1]
error_actual_total <- error_actualCanceled
#MAE
cat("MAE:",mean(as.matrix(abs(error_forecastCanceled - error_actualCanceled))))#1 forecast, 2 actual.
cat("\n\n")
#BIAS
bias = sum(as.matrix(error_forecastCanceled - error_actualCanceled)) * 1.0/dim(error_actualCanceled)[1]
cat('Bias %:', bias)
# --- Forecast accuracy: checked-out bookings by arrival date (ARIMA) -------
# Compare the 20-step-ahead forecast against the last 20 observed values.
error_forecastCheckout <- as.data.frame(fCheckout.arima$mean[1:20]) #forecast
error_actualCheckout <- as.data.frame(tail(zArrivalDateCheckout, n = 20)) #actual
dim(error_actualCheckout)[1]
error_actual_total <- error_actualCheckout
#MAE
# Mean Absolute Error across the 20-step holdout window.
cat("MAE:", mean(abs(as.matrix(error_forecastCheckout - error_actualCheckout)))) #1 forecast, 2 actual.
cat("\n\n")
#BIAS
# Mean signed error (sum of errors / number of points == mean of the
# one-column error matrix).
bias <- mean(as.matrix(error_forecastCheckout - error_actualCheckout))
cat('Bias %:', bias)
# --- Forecast accuracy: canceled + checkout combined (arrival date) --------
# Sum the two component forecasts/actuals to evaluate the combined model
# against total arrivals.
error_forecast<- error_forecastCanceled + error_forecastCheckout
error_actual <- error_actualCanceled + error_actualCheckout
dim(error_actual)[1]
[1] 20
error_actual_total <- error_actual
#MAE
cat("MAE:",mean(as.matrix(abs(error_forecast - error_actual))))#1 forecast, 2 actual.
MAE: 60.66275
cat("\n\n")
#BIAS
# Mean signed error of the combined forecast.
bias = sum(as.matrix(error_forecast - error_actual)) * 1.0/dim(error_actual)[1]
cat('Bias %:', bias)
Bias %: 54.63568
# --- Forecast accuracy: canceled bookings by reservation date (ARIMA) ------
error_forecastCanceled<- as.data.frame(fCanceled.arimaReservation$mean[1:20])#forecast
error_actualCanceled <- as.data.frame(tail(zReservationDateCanceled,n=20))#actual
dim(error_actualCanceled)[1]
[1] 20
error_actual_total <- error_actualCanceled
#MAE
cat("MAE:",mean(as.matrix(abs(error_forecastCanceled - error_actualCanceled))))#1 forecast, 2 actual.
MAE: 31.65391
cat("\n\n")
#BIAS
# Positive bias => over-forecasting cancellations on average.
bias = sum(as.matrix(error_forecastCanceled - error_actualCanceled)) * 1.0/dim(error_actualCanceled)[1]
cat('Bias %:', bias)
Bias %: 30.36087
# --- Forecast accuracy: checked-out bookings by reservation date (ARIMA) ---
error_forecastCheckout<- as.data.frame(fCheckout.arimaReservation$mean[1:20])#forecast
error_actualCheckout <- as.data.frame(tail(zReservationDateCheckout,n=20))#actual
dim(error_actualCheckout)[1]
[1] 20
error_actual_total <- error_actualCheckout
#MAE
cat("MAE:",mean(as.matrix(abs(error_forecastCheckout - error_actualCheckout))))#1 forecast, 2 actual.
MAE: 33.83992
cat("\n\n")
#BIAS
# Positive bias => over-forecasting checkouts on average.
bias = sum(as.matrix(error_forecastCheckout - error_actualCheckout)) * 1.0/dim(error_actualCheckout)[1]
cat('Bias %:', bias)
Bias %: 24.27481
# --- Forecast accuracy: canceled + checkout combined (reservation date) ----
# NOTE(review): the printed MAE (60.66275) and Bias (54.63568) below are
# byte-identical to the arrival-date combination above, yet the component
# errors differ (31.65 + 33.84 vs. the arrival components) -- this output
# looks like a stale transcript; re-run to confirm.
error_forecast<- error_forecastCanceled + error_forecastCheckout
error_actual <- error_actualCanceled + error_actualCheckout
dim(error_actual)[1]
[1] 20
error_actual_total <- error_actual
#MAE
cat("MAE:",mean(as.matrix(abs(error_forecast - error_actual))))#1 forecast, 2 actual.
MAE: 60.66275
cat("\n\n")
#BIAS
bias = sum(as.matrix(error_forecast - error_actual)) * 1.0/dim(error_actual)[1]
cat('Bias %:', bias)
Bias %: 54.63568
# --- Final models ----------------------------------------------------------
# ETS models (Box-Cox lambda selected automatically) for canceled and
# checked-out bookings by arrival date; the last observation is held out.
etsfitArrivalCanceled <- ets(zArrivalDateCanceled[-length(zArrivalDateCanceled)], lambda = "auto", use.initial.values = TRUE, nmse = 30)
etsfitArrivalCheckout <- ets(zArrivalDateCheckout[-length(zArrivalDateCheckout)], lambda = "auto", use.initial.values = TRUE, nmse = 30)
# Auto ARIMA on total reservations by reservation date (lambda = 0 => log
# transform). Fixed: `seasonal = T` -> `seasonal = TRUE` (`T` is a
# reassignable binding, not a reserved word).
# NOTE(review): max.d = 52 / max.D = 52 is far above auto.arima's usual
# defaults (2 and 1) and effectively unconstrains differencing -- confirm
# this was intentional.
fit1Reservation <- auto.arima(zReservationDateNumber, lambda = 0, start.p = 0, start.Q = 0, max.p = 10, max.q = 10, start.P = 0, start.q = 0, max.P = 10, max.Q = 10, max.d = 52, max.D = 52, seasonal = TRUE)
#forecast model
# 20-step-ahead forecasts from the two ETS models; their sum is the final
# predicted number of reservations per arrival date.
fCanceled.ets <- forecast(etsfitArrivalCanceled, h = 20)
fCheckout.ets <- forecast(etsfitArrivalCheckout, h = 20)
plot(fCanceled.ets$mean + fCheckout.ets$mean, xlim = c(0,150), ylim = c(100, 600), main = 'Final Prediction', ylab = 'Number of Reservations for Arrival Date')
lines(window(zArrivalDateNumber[-length(zArrivalDateNumber)]),type="o")
# ARIMA forecast for reservation-date totals, plotted over the actual series.
fReservation.arima <- forecast(fit1Reservation, h = 20)
plot(fReservation.arima)
lines(window(zReservationDateNumber),type="o")
# Side-by-side comparison: column 1 = forecast, column 2 = last 20 actuals.
res_ArimaMatrix <- matrix(c(fReservation.arima$mean[1:20],
                            as.double(tail(zReservationDateNumber, n = 20))),
                          ncol = 2)
res_ArimaMatrix
[,1] [,2]
[1,] 20.29468 183
[2,] 17.17760 208
[3,] 19.30864 197
[4,] 17.78822 246
[5,] 18.84139 191
[6,] 18.09646 153
[7,] 18.61574 166
[8,] 18.25002 141
[9,] 18.50574 133
[10,] 18.32603 151
[11,] 18.45188 152
[12,] 18.36353 173
[13,] 18.42545 170
[14,] 18.38200 120
[15,] 18.41246 105
[16,] 18.39109 106
[17,] 18.40608 78
[18,] 18.39557 77
[19,] 18.40294 54
[20,] 18.39777 16